# import key libraries
import pandas as pd
import plotly.express as px
from copy import copy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from jupyterthemes import jtplot
# Configure jtplot styling: 'monokai' theme, 'notebook' context, ticks enabled, grid disabled
jtplot.style(theme = 'monokai', context = 'notebook', ticks = True, grid = False)
# Data Source: https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data/notebooks?datasetId=29&sortBy=voteCount
# Load the per-country temperature records from the CSV in the working directory
temperature_df = pd.read_csv('GlobalLandTemperaturesByCountry.csv')

# First look at the raw table: contents, dimensions and summary statistics
temperature_df
temperature_df.shape
temperature_df.describe()

# Distinct country names present in the data
pd.unique(temperature_df['Country'])

# Number of missing entries in each column
temperature_df.isna().sum()

# Print the dataframe information summary
temperature_df.info()
# Count the non-null values available per country, then rename the two
# measurement columns so it is clear they now hold counts rather than readings
count_column_names = {
    'AverageTemperature': 'AverageTemperatureCount',
    'AverageTemperatureUncertainty': 'AverageTemperatureUncertaintyCount',
}
per_country_counts = temperature_df.groupby(by='Country').count()
country_group_df = per_country_counts.reset_index('Country').rename(columns=count_column_names)
country_group_df
country_group_df['Country']
import plotly.express as px

# Bar chart of each count column, one bar per country
for count_column in ('AverageTemperatureCount', 'AverageTemperatureUncertaintyCount'):
    fig = px.bar(country_group_df, x='Country', y=count_column)
    fig.show()

# Histogram of each count column to see how the counts are distributed
for count_column in ('AverageTemperatureCount', 'AverageTemperatureUncertaintyCount'):
    fig = px.histogram(country_group_df, x=count_column)
    fig.show()
# Rows of the per-country count table where either measurement column has
# fewer than 1500 non-null values
min_observations = 1500
sparse_country_rows = country_group_df[
    (country_group_df['AverageTemperatureCount'] < min_observations)
    | (country_group_df['AverageTemperatureUncertaintyCount'] < min_observations)
]
sparse_country_rows

# Find countries with less than 1500 data info.
# Collect the country NAMES, not the row labels of the count table: the filter
# below is matched against the string 'Country' column, so integer index
# labels would never match and no country would actually be removed.
countries_with_less_data = sparse_country_rows['Country'].tolist()
countries_with_less_data

# Boolean mask that is True for every record belonging to a country we keep
keep_mask = ~temperature_df['Country'].isin(countries_with_less_data)
keep_mask

# Remove countries with less data info; reset_index(drop=True) returns a new
# frame renumbered from 0, so later column assignments do not act on a slice
temperature_df = temperature_df[keep_mask].reset_index(drop=True)
temperature_df
# Fill missing values with a rolling average over the previous 730 rows
# (the window counts rows, not calendar days).
# The rolling mean is computed separately for each country so that a gap at
# the start of one country's records is never filled with readings that belong
# to the country stored just before it in the table. A missing value with no
# earlier non-missing reading for the same country inside the window stays NaN.
rolling_window = 730
for column in ('AverageTemperature', 'AverageTemperatureUncertainty'):
    per_country_rolling_mean = temperature_df.groupby('Country')[column].transform(
        lambda values: values.rolling(rolling_window, min_periods=1).mean()
    )
    temperature_df[column] = temperature_df[column].fillna(per_country_rolling_mean)

# Remaining missing values per column after the fill
temperature_df.isna().sum()
temperature_df['Country'].unique()

# Country names that carry a parenthetical qualifier (they contain a '(')
duplicates = [name for name in temperature_df['Country'].unique() if '(' in name]
duplicates

# replace duplicates: map each flagged name to the text before its '('.
# Deriving the replacement from the name itself keeps the mapping correct
# whatever the number or order of flagged names, instead of relying on a
# hand-written list that must line up with `duplicates` position by position.
duplicate_replacements = {name: name.split('(')[0].strip() for name in duplicates}
if duplicate_replacements:
    # Only the 'Country' column is rewritten; other columns are left untouched
    temperature_df = temperature_df.assign(
        Country=temperature_df['Country'].replace(duplicate_replacements)
    )
temperature_df['Country'].unique()
# Country names in order of first appearance in the table
countries = temperature_df['Country'].unique().tolist()
countries

# mean temperature for each country.
# One grouped pass computes every country's mean, instead of re-filtering the
# whole table once per country; reindexing by `countries` keeps the values in
# the same order as the names so the two lists stay aligned.
country_means = temperature_df.groupby('Country')['AverageTemperature'].mean()
mean_temperature = country_means.reindex(countries).tolist()
# Plot mean temperature of countries on a globe
choropleth_trace = {
    'type': 'choropleth',               # type of map
    'locations': countries,             # location names
    'z': mean_temperature,              # temperature of countries
    'locationmode': 'country names',
}
data = [choropleth_trace]

geo_settings = {
    'showframe': False,
    'showocean': True,                  # to show the ocean
    'oceancolor': 'aqua',
    'projection': {'type': 'orthographic'},  # to get the globe view
}
layout = {
    'title': 'Average Global Land Temperatures',
    'geo': geo_settings,
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, validate=False, filename='worldmap')
# Year of recorded data for visualization: the text before the first '-' of
# each 'dt' value
temperature_df['year'] = [stamp.split('-')[0] for stamp in temperature_df['dt']]
temperature_df

# Animated choropleth to see the global temperature change, one frame per year
animation_options = {
    'locations': 'Country',
    'locationmode': 'country names',    # locations
    'color': 'AverageTemperature',      # column representing the temperature
    'hover_name': "Country",            # column to add to hover information
    'animation_frame': 'year',          # timeframe for animation
    'color_continuous_scale': px.colors.sequential.deep_r,
}
fig = px.choropleth(temperature_df, **animation_options)
# py.plot(fig)
fig.show()